This notebook will likely be broken into at least two
#| default_exp dna
# !cd ../ && pip install -e '.[dev]'
import os
import numpy as np
import pandas as pd
import plotly.express as px
import hilbertcurve
from hilbertcurve.hilbertcurve import HilbertCurve
# !conda install openpyxl -y
# ! conda install h5py -y
# ! pip install hilbertcurve
Working with these records proved tricky. Ultimately I need the nucleotide data in a tensor, but after using tassel to save the data (ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023) as a table (along with position list and taxa list) it's too big to easily load (>30Gb). As a work around to easily access specific genomes, I split the table into a separate file for the header and each genome so that these files can be read piecemeal. See the Readme below for more details.
#| export
def read_txt(path,
**kwargs # Intended to allow for explicit 'encoding' to be passed into open the file
):
if 'encoding' in kwargs.keys():
print(kwargs)
with open(path, 'r', encoding = kwargs['encoding']) as f:
data = f.read()
else:
with open(path, 'r') as f:
data = f.read()
return(data)
#| export
def print_txt(path):
print(read_txt(path = path))
AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'
print_txt(path = AGPv4_path+'Readme')
Getting a script to split the table by line and rename all to the taxa name didn't work. It's possible that this is from sed, but it's not worth debugging. Instead I'm doing this manually which is what I should have done to begin with. 1. use split to produce the needed files split -l 1 ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table.txt 2. move those to their own folder mv xz* GBSv27_publicSamples_imputedV5_AGPv4-181023_Table/ 3. go over all files in the folder, pull out column one, swap out the : for another character and rename it.
This last point was completed with the following shell script.
print_txt(path = AGPv4_path+'rename_all.sh')
#!/usr/bin/bash
files_path='./ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
out_path='./ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
#out_path='./out/'
files=$(ls "$files_path")
#echo $files
for file in $files
do
#echo $file
taxa=$(awk '{print $1}' <<< cat "$files_path$file")
taxa_sub=$(sed -r 's/[:]/__/g' <<< "$taxa")
#echo $taxa_sub
cp $files_path$file $out_path$taxa_sub
#echo $taxa
done
With that done, and with the summary files from tassel (position and taxa), the genomes can be individually loaded as needed.
# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()
| Taxa | LibraryPrepID | Status | DNAPlate | GENUS | INBREEDF | SPECIES | DNASample | Flowcell_Lane | NumLanes | ... | GermplasmSet | Barcode | LibraryPlate | Tassel4SampleName | Population | LibraryPlateWell | SampleDNAWell | OwnerEmail | PEDIGREE | SeedLot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 05-397:250007467 | 250007467 | public | P3-GS-F | Zea | 0.95 | mays | 05-397 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTTGAA | P3-GS-F | 05-397:C00R8ABXX:4:250007467 | Inbred | F11 | F11 | esb33@cornell.edu | NaN | NaN |
| 1 | 05-438:250007407 | 250007407 | public | P3-GS-F | Zea | 0.95 | mays | 05-438 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTATT | P3-GS-F | 05-438:C00R8ABXX:4:250007407 | Inbred | B03 | B03 | esb33@cornell.edu | NaN | NaN |
| 2 | 12E:250032344 | 250032344 | public | Ames12 | Zea | 0.95 | mays | 12E | 81FE8ABXX_4 | 1.0 | ... | Ames | GCTGTGGA | Ames12 | 12E:81FE8ABXX:4:250032344 | inbred | H08 | H08 | esb33@cornell.edu | 12E | NaN |
| 3 | 207:250007202 | 250007202 | public | P1-GS-F | Zea | 0.95 | mays | 207 | C00R8ABXX_2 | 1.0 | ... | Margaret Smith lines | TACAT | P1-GS-F | 207:C00R8ABXX:2:250007202 | Inbred | E12 | E12 | esb33@cornell.edu | NaN | NaN |
| 4 | 22612:250007466 | 250007466 | public | P3-GS-F | Zea | 0.95 | mays | 22612 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTACTT | P3-GS-F | 22612:C00R8ABXX:4:250007466 | Inbred | F10 | F10 | esb33@cornell.edu | NaN | NaN |
5 rows × 21 columns
# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()
| Site | Name | Chromosome | Position | |
|---|---|---|---|---|
| 0 | 0 | S1_6370 | 1 | 52399 |
| 1 | 1 | S1_8210 | 1 | 54239 |
| 2 | 2 | S1_8376 | 1 | 54405 |
| 3 | 3 | S1_9889 | 1 | 55917 |
| 4 | 4 | S1_9899 | 1 | 55927 |
Retrieving a genome by taxa name:
# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]
['xztea', 'xzedh', 'Z020E0024__250026132']
#| export
def taxa_to_filename(taxa = '05-397:250007467'): return(taxa.replace(':', '__'))
taxa_to_filename(taxa = '05-397:250007467')
'05-397__250007467'
#| export
def find_AGPv4(
taxa, # should be the desired taxa or a regex fragment (stopping before the __). E.g. 'B73' or 'B\d+'
**kwargs # optionally pass in a genome list (this allows for a different path or precomputing if we're finding a lot of genomes)
):
"Search for existing marker sets __"
if 'genome_files' not in kwargs.keys():
import os
genome_files = os.listdir(
'../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
else:
genome_files = kwargs['genome_files']
import re
return( [e for e in genome_files if re.match(taxa+'__.+', e)] )
#| export
def get_AGPv4(
taxa,
**kwargs
):
"Retrieve an existing marker set"
if 'table_directory' not in kwargs.keys():
table_directory = '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
else:
table_directory = kwargs['table_directory']
with open(table_directory+taxa, 'r') as f:
data = f.read()
data = data.split('\t')
return(data)
# def get_AGPv4(
# taxa,
# table_directory = '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# ):
# with open(table_directory+taxa, 'r') as f:
# data = f.read()
# data = data.split('\t')
# return(data)
get_AGPv4('05-397__250007467')[0:4]
['05-397:250007467', 'T', 'T', 'A']
In addition to returning a specific taxa, the table's headers can be retieved with "taxa".
get_AGPv4(taxa = 'taxa')[0:4]
['Taxa', '52399', '54239', '54405']
Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.
len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape
(943456, (943455, 4))
ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
temp[res[0]] = res[1:] # Add Col. with Nucleotides
temp.head()
| Chromosome | Position | 05-397:250007467 | |
|---|---|---|---|
| 0 | 1 | 52399 | T |
| 1 | 1 | 54239 | T |
| 2 | 1 | 54405 | A |
| 3 | 1 | 55917 | N |
| 4 | 1 | 55927 | N |
mask = (temp.Chromosome == 1)
temp_pos = temp.loc[mask, ['Position']]
temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']
temp_pos.loc[0, 'Diff'] = None
temp_pos
| Position | Shift | Diff | |
|---|---|---|---|
| 0 | 52399 | 0 | NaN |
| 1 | 54239 | 52399 | 1840.0 |
| 2 | 54405 | 54239 | 166.0 |
| 3 | 55917 | 54405 | 1512.0 |
| 4 | 55927 | 55917 | 10.0 |
| ... | ... | ... | ... |
| 147145 | 306971046 | 306910117 | 60929.0 |
| 147146 | 306971061 | 306971046 | 15.0 |
| 147147 | 306971063 | 306971061 | 2.0 |
| 147148 | 306971073 | 306971063 | 10.0 |
| 147149 | 306971080 | 306971073 | 7.0 |
147150 rows × 3 columns
# px.histogram(temp_pos, x = 'Diff')
res = get_AGPv4(taxa_to_filename(taxa = '05-397:250007467'))
res = res[1:] # drop taxa
#| export
def list_to_ACGT(
in_seq, # This should be a list with strings corresponding to IUPAC codes e.g. ['A', 'C', 'Y']
progress = False
):
import numpy as np
import tqdm
from tqdm import tqdm
# Convert IUPAC codes into pr ACGT -------------------------------------------
encode_dict = {
# https://www.bioinformatics.org/sms/iupac.html
# A C G T
'A': [1, 0, 0, 0 ],
'C': [0, 1, 0, 0 ],
'G': [0, 0, 1, 0 ],
'T': [0, 0, 0, 1 ],
'K': [0, 0, 0.5, 0.5 ],
'M': [0.5, 0.5, 0, 0 ],
'N': [0.25, 0.25, 0.25, 0.25],
'R': [0.5, 0, 0.5, 0 ],
'S': [0, 0.5, 0.5, 0 ],
'W': [0.5, 0, 0, 0.5 ],
'Y': [0, 0.5, 0, 0.5 ],
# Other values (assumed empty)
# A C G T
'': [0, 0, 0, 0 ],
'-': [0, 0, 0, 0 ],
'0': [0, 0, 0, 0 ],
}
# Cleanup --
# Any newlines need to be removed
in_seq = [e.replace('\n', '') for e in in_seq]
# Check if there's anything that should be in the dictionary but is not.
not_in_dict = [e for e in list(set(in_seq)) if e not in list(encode_dict.keys())]
if not_in_dict != []:
print("Waring: The following are not in the encoding dictionary and will be set as missing.\n"+str(not_in_dict))
in_seq = [e if e not in not_in_dict else '' for e in in_seq]
# output matrix
GMat = np.zeros(shape = [len(in_seq), 4])
# convert all nucleotides to probabilities
if progress == True:
for nucleotide in tqdm(encode_dict.keys()):
mask = [True if e == nucleotide else False for e in in_seq]
GMat[mask, :] = encode_dict[nucleotide]
else:
for nucleotide in encode_dict.keys():
mask = [True if e == nucleotide else False for e in in_seq]
GMat[mask, :] = encode_dict[nucleotide]
return(GMat)
res = list_to_ACGT(in_seq = res)
res = res[0:1000]
100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 14.64it/s]
res.shape
(1000, 4)
#| export
def calc_needed_hilbert_p(n_needed = 1048576,
max_p = 20):
out = None
for i in range(1, max_p):
if 4**i > n_needed:
out = i
break
return(out)
#| export
def np_2d_to_hilbert(
in_seq # This should be a 2d numpy array with dimensions of [sequence, channels]
):
import numpy as np
import tqdm
from tqdm import tqdm
import hilbertcurve
from hilbertcurve.hilbertcurve import HilbertCurve
import dlgwas
from dlgwas.dna import calc_needed_hilbert_p
n_snps = in_seq.shape[0]
n_channels = in_seq.shape[-1]
temp = in_seq
p_needed = calc_needed_hilbert_p(n_needed=n_snps)
# Data represented need not be continuous -- it need only have int positions
# a sequence or a sequence with gaps can be encoded
hilbert_curve = HilbertCurve(
p = p_needed, # iterations i.e. hold 4^p positions
n = 2 # dimensions
)
points = hilbert_curve.points_from_distances(range(n_snps))
dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
dim_1 = np.max(np.array(points)[:, 1])+1
temp_mat = np.zeros(shape = [dim_0, dim_1, n_channels])
temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
for i in tqdm(range(n_snps)):
temp_mat[points[i][0], points[i][1], :] = temp[i]
return(temp_mat)
#| export
def np_3d_to_hilbert(
in_seq # This should be a 3d numpy array with dimensions of [samples, sequence, channels]
):
"This is the 3d version of `np_2d_to_hilbert`. The goal is to process all of the samples of an array in one go."
import numpy as np
import tqdm
from tqdm import tqdm
import hilbertcurve
from hilbertcurve.hilbertcurve import HilbertCurve
import dlgwas
from dlgwas.dna import calc_needed_hilbert_p
n_snps = in_seq.shape[1]
n_channels = in_seq.shape[-1]
temp = in_seq
p_needed = calc_needed_hilbert_p(n_needed=n_snps)
# Data represented need not be continuous -- it need only have int positions
# a sequence or a sequence with gaps can be encoded
hilbert_curve = HilbertCurve(
p = p_needed, # iterations i.e. hold 4^p positions
n = 2 # dimensions
)
points = hilbert_curve.points_from_distances(range(n_snps))
dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
dim_1 = np.max(np.array(points)[:, 1])+1
temp_mat = np.zeros(shape = [in_seq.shape[0], dim_0, dim_1, n_channels])
temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
for i in tqdm(range(n_snps)):
temp_mat[:, # sample
points[i][0], points[i][1], # x, y
:] = temp[:, i] # channels
return(temp_mat)
demo = np_2d_to_hilbert(
in_seq = np.asarray([np.linspace(1, 100, num= 50),
np.linspace(100, 1, num= 50)]).T
)
100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 168852.82it/s]
px.imshow(demo[:,:,0])
px.imshow(demo[:,:,1])
Explictly convert a taxa
taxa_to_filename(taxa = '05-397:250007467')
'05-397__250007467'
Or search for a taxa
find_AGPv4(taxa = '05-397')
['05-397__250007467']
Retrieve the sequence data
res = get_AGPv4(taxa_to_filename(taxa = '05-397:250007467'))
res = res[1:] # drop taxa
res[0:10]
['T', 'T', 'A', 'N', 'N', 'N', 'N', 'C', 'C', 'N']
Convert from characters to encoded nucleotide probabilities
res = list_to_ACGT(in_seq = res)
res = res[0:1000]
res
100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 15.00it/s]
array([[0., 0., 0., 1.],
[0., 0., 0., 1.],
[1., 0., 0., 0.],
...,
[0., 0., 0., 1.],
[1., 0., 0., 0.],
[0., 1., 0., 0.]])
Convert the sequence to a hilbert curve
# This will happen under the hood
# calc_needed_hilbert_p(n_needed=res.shape[0])
res_hilb = np_2d_to_hilbert(
in_seq = res
)
100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1209081.58it/s]
px.imshow( res[0:20, 0:1] )
px.imshow( res_hilb[:, :, 0] )
px.imshow( res_hilb[:, :, 1] )
#| hide
import nbdev; nbdev.nbdev_export()